import numpy as np
import pandas as pd
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
order = pd.read_csv("https://raw.githubusercontent.com/abinash25/Sales-Analysis/refs/heads/main/List%20of%20Orders.csv")
order.head()
| Order ID | Order Date | CustomerName | State | City | |
|---|---|---|---|---|---|
| 0 | B-25601 | 01-04-2018 | Bharat | Gujarat | Ahmedabad |
| 1 | B-25602 | 01-04-2018 | Pearl | Maharashtra | Pune |
| 2 | B-25603 | 03-04-2018 | Jahan | Madhya Pradesh | Bhopal |
| 3 | B-25604 | 03-04-2018 | Divsha | Rajasthan | Jaipur |
| 4 | B-25605 | 05-04-2018 | Kasheen | West Bengal | Kolkata |
details = pd.read_csv("https://raw.githubusercontent.com/abinash25/Sales-Analysis/refs/heads/main/Order%20Details.csv")
details.head()
| Order ID | Amount | Profit | Quantity | Category | Sub-Category | |
|---|---|---|---|---|---|---|
| 0 | B-25601 | 1275.0 | -1148.0 | 7 | Furniture | Bookcases |
| 1 | B-25601 | 66.0 | -12.0 | 5 | Clothing | Stole |
| 2 | B-25601 | 8.0 | -2.0 | 3 | Clothing | Hankerchief |
| 3 | B-25601 | 80.0 | -56.0 | 4 | Electronics | Electronic Games |
| 4 | B-25602 | 168.0 | -111.0 | 2 | Electronics | Phones |
target = pd.read_csv("https://raw.githubusercontent.com/abinash25/Sales-Analysis/refs/heads/main/Sales%20target.csv")
target.head()
| Month of Order Date | Category | Target | |
|---|---|---|---|
| 0 | Apr-18 | Furniture | 10400.0 |
| 1 | May-18 | Furniture | 10500.0 |
| 2 | Jun-18 | Furniture | 10600.0 |
| 3 | Jul-18 | Furniture | 10800.0 |
| 4 | Aug-18 | Furniture | 10900.0 |
# Cleaning the order dataset
order.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 560 entries, 0 to 559 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 500 non-null object 1 Order Date 500 non-null object 2 CustomerName 500 non-null object 3 State 500 non-null object 4 City 500 non-null object dtypes: object(5) memory usage: 22.0+ KB
# Changing the Order Date variable to datetime data type
order['Order Date'] = order['Order Date'].astype('datetime64[ns]')
# Checking null values
order.isnull().sum()
Order ID 60 Order Date 60 CustomerName 60 State 60 City 60 dtype: int64
# Droping Null Values
order = order.dropna()
order.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 500 entries, 0 to 499 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 500 non-null object 1 Order Date 500 non-null datetime64[ns] 2 CustomerName 500 non-null object 3 State 500 non-null object 4 City 500 non-null object dtypes: datetime64[ns](1), object(4) memory usage: 23.4+ KB
# Cleaning the detail dataset
details.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 1500 non-null object 1 Amount 1500 non-null float64 2 Profit 1500 non-null float64 3 Quantity 1500 non-null int64 4 Category 1500 non-null object 5 Sub-Category 1500 non-null object dtypes: float64(2), int64(1), object(3) memory usage: 70.4+ KB
# Null values
details.isnull().sum()
Order ID 0 Amount 0 Profit 0 Quantity 0 Category 0 Sub-Category 0 dtype: int64
# Chaning the Category and Sub-category variable to categorical data type
details['Category'] = details['Category'].astype('category')
details['Sub-Category'] = details['Sub-Category'].astype('category')
details.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 1500 non-null object 1 Amount 1500 non-null float64 2 Profit 1500 non-null float64 3 Quantity 1500 non-null int64 4 Category 1500 non-null category 5 Sub-Category 1500 non-null category dtypes: category(2), float64(2), int64(1), object(1) memory usage: 50.7+ KB
# Cleaning Target dataset
target.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36 entries, 0 to 35 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month of Order Date 36 non-null object 1 Category 36 non-null object 2 Target 36 non-null float64 dtypes: float64(1), object(2) memory usage: 992.0+ bytes
# Coverting Category variable to category data
target['Category'] = target['Category'].astype('category')
target.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36 entries, 0 to 35 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month of Order Date 36 non-null object 1 Category 36 non-null category 2 Target 36 non-null float64 dtypes: category(1), float64(1), object(1) memory usage: 872.0+ bytes
# Cleanded Details data
details.head()
| Order ID | Amount | Profit | Quantity | Category | Sub-Category | |
|---|---|---|---|---|---|---|
| 0 | B-25601 | 1275.0 | -1148.0 | 7 | Furniture | Bookcases |
| 1 | B-25601 | 66.0 | -12.0 | 5 | Clothing | Stole |
| 2 | B-25601 | 8.0 | -2.0 | 3 | Clothing | Hankerchief |
| 3 | B-25601 | 80.0 | -56.0 | 4 | Electronics | Electronic Games |
| 4 | B-25602 | 168.0 | -111.0 | 2 | Electronics | Phones |
# Cleaned Order Data
order.head()
| Order ID | Order Date | CustomerName | State | City | |
|---|---|---|---|---|---|
| 0 | B-25601 | 2018-01-04 | Bharat | Gujarat | Ahmedabad |
| 1 | B-25602 | 2018-01-04 | Pearl | Maharashtra | Pune |
| 2 | B-25603 | 2018-03-04 | Jahan | Madhya Pradesh | Bhopal |
| 3 | B-25604 | 2018-03-04 | Divsha | Rajasthan | Jaipur |
| 4 | B-25605 | 2018-05-04 | Kasheen | West Bengal | Kolkata |
# Cleaned Target Dataset
target.head()
| Month of Order Date | Category | Target | |
|---|---|---|---|
| 0 | Apr-18 | Furniture | 10400.0 |
| 1 | May-18 | Furniture | 10500.0 |
| 2 | Jun-18 | Furniture | 10600.0 |
| 3 | Jul-18 | Furniture | 10800.0 |
| 4 | Aug-18 | Furniture | 10900.0 |
Making a new dataframe containing the Amount, Profit and Quantity of the different orders. Then joining it with the Order datasets by taking Order ID as the Primary Key.
profits = details.groupby('Order ID').sum().reset_index()
profits.head()
| Order ID | Amount | Profit | Quantity | |
|---|---|---|---|---|
| 0 | B-25601 | 1429.0 | -1218.0 | 19 |
| 1 | B-25602 | 3889.0 | 975.0 | 22 |
| 2 | B-25603 | 2025.0 | -180.0 | 25 |
| 3 | B-25604 | 222.0 | 22.0 | 11 |
| 4 | B-25605 | 75.0 | 0.0 | 7 |
df = pd.merge(order, profits)
df.head()
| Order ID | Order Date | CustomerName | State | City | Amount | Profit | Quantity | |
|---|---|---|---|---|---|---|---|---|
| 0 | B-25601 | 2018-01-04 | Bharat | Gujarat | Ahmedabad | 1429.0 | -1218.0 | 19 |
| 1 | B-25602 | 2018-01-04 | Pearl | Maharashtra | Pune | 3889.0 | 975.0 | 22 |
| 2 | B-25603 | 2018-03-04 | Jahan | Madhya Pradesh | Bhopal | 2025.0 | -180.0 | 25 |
| 3 | B-25604 | 2018-03-04 | Divsha | Rajasthan | Jaipur | 222.0 | 22.0 | 11 |
| 4 | B-25605 | 2018-05-04 | Kasheen | West Bengal | Kolkata | 75.0 | 0.0 | 7 |
Trend analysis is to find patterns in data, such as ups & downs. A “trend” is an upwards or downwards shift in a data set over time. In retail, this analysis of past trends in sales or revenue; allows to predict the future market. This analysis useful for budgeting and forecasting. Total sales of any business on a trend line may obtain some significant information.
df['Year'] = pd.DatetimeIndex(df['Order Date']).year
df['Month_Number'] = pd.DatetimeIndex(df['Order Date']).month
df['Month'] = df['Month_Number'].apply(lambda x: calendar.month_abbr[x])
year_month = df.groupby(['Year', 'Month','Month_Number']).sum().sort_values(['Year','Month_Number'])
year_month
| Amount | Profit | Quantity | |||
|---|---|---|---|---|---|
| Year | Month | Month_Number | |||
| 2018 | Jan | 1 | 18035.0 | -3296.0 | 203 |
| Feb | 2 | 6566.0 | 685.0 | 58 | |
| Mar | 3 | 7434.0 | 669.0 | 144 | |
| Apr | 4 | 26170.0 | -1043.0 | 337 | |
| May | 5 | 20422.0 | -891.0 | 306 | |
| Jun | 6 | 17406.0 | -3759.0 | 353 | |
| Jul | 7 | 15682.0 | -2065.0 | 239 | |
| Aug | 8 | 45269.0 | -1059.0 | 601 | |
| Sep | 9 | 20210.0 | -3509.0 | 310 | |
| Oct | 10 | 32758.0 | 5979.0 | 414 | |
| Nov | 11 | 38858.0 | 4955.0 | 433 | |
| Dec | 12 | 23892.0 | 1535.0 | 209 | |
| 2019 | Jan | 1 | 50448.0 | 8655.0 | 640 |
| Feb | 2 | 15894.0 | 2291.0 | 253 | |
| Mar | 3 | 39700.0 | 6633.0 | 485 | |
| Apr | 4 | 11079.0 | 1295.0 | 106 | |
| May | 5 | 4390.0 | 943.0 | 63 | |
| Jun | 6 | 3392.0 | 700.0 | 52 | |
| Jul | 7 | 5116.0 | 975.0 | 67 | |
| Aug | 8 | 6557.0 | 594.0 | 83 | |
| Sep | 9 | 5583.0 | 1597.0 | 70 | |
| Oct | 10 | 14147.0 | 1892.0 | 149 | |
| Nov | 11 | 2235.0 | 122.0 | 35 | |
| Dec | 12 | 259.0 | 57.0 | 5 |
year_month = year_month.reset_index()
year_month["Color"] = np.where(year_month["Profit"]<0, 'Loss', 'Profit')
year_month_2018 = year_month[year_month['Year']==2018]
fig = px.bar(year_month_2018, x='Month_Number', y='Profit', color='Color',
title="Monthly Profit in 2018",
labels=dict(Month_Number="Month", Profit="Profit", Color="Results"),
color_discrete_map={
'Loss': '#EC2049',
'Profit': '#2F9599'},
hover_data=["Month", "Profit"],
template='plotly_white')
fig.update_layout(yaxis_tickprefix = '₹', yaxis_tickformat = ',.2f')
fig.update_layout(
xaxis = dict(
tickvals = [1, 2, 3, 4, 5, 6, 7,8 ,9, 10, 11, 12],
ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
)
)
fig.show()
year_month_2019 = year_month[year_month['Year']==2019]
fig = px.bar(year_month_2019, x='Month_Number', y='Profit', color='Color',
title="Monthly Profit in 2019",
labels=dict(Month_Number="Month", Profit="Profit", Color="Results"),
color_discrete_map={
'Loss': '#EC2049',
'Profit': '#2F9599'},
hover_data=["Month", "Profit"],
template='plotly_white')
fig.update_layout(yaxis_tickprefix = '₹', yaxis_tickformat = ',.2f')
fig.update_layout(
xaxis = dict(
tickvals = [1, 2, 3, 4, 5, 6, 7,8 ,9, 10, 11, 12],
ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
)
)
fig.show()
orders_by_state = order.groupby(['State']).size().reset_index(name='Total Orders').sort_values(['Total Orders'])
orders_by_state
| State | Total Orders | |
|---|---|---|
| 16 | Tamil Nadu | 8 |
| 15 | Sikkim | 12 |
| 3 | Goa | 14 |
| 5 | Haryana | 14 |
| 6 | Himachal Pradesh | 14 |
| 7 | Jammu and Kashmir | 14 |
| 0 | Andhra Pradesh | 15 |
| 12 | Nagaland | 15 |
| 9 | Kerala | 16 |
| 1 | Bihar | 16 |
| 8 | Karnataka | 21 |
| 17 | Uttar Pradesh | 22 |
| 2 | Delhi | 22 |
| 18 | West Bengal | 22 |
| 13 | Punjab | 25 |
| 4 | Gujarat | 27 |
| 14 | Rajasthan | 32 |
| 11 | Maharashtra | 90 |
| 10 | Madhya Pradesh | 101 |
profit_by_state = df.groupby('State').sum().reset_index().sort_values(['Profit'])
profit_by_state["Color"] = np.where(profit_by_state["Profit"]<0, 'Loss', 'Profit')
fig = px.bar(profit_by_state, x='State', y='Profit',
color='Color', color_discrete_map={
'Loss': '#EC2049',
'Profit': '#2F9599'},
title="Profit by State",
labels=dict(Color="Results"),
template='plotly_white')
# Disabling Zoom
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.update_layout(yaxis_tickprefix = '₹', yaxis_tickformat = ',.2f')
fig.update_xaxes(
tickangle = -90,
title_text = "States",
)
fig.show()
top_customers = df.groupby('CustomerName').sum().reset_index().sort_values(['Quantity'], ascending=False).head(5)
colors = ['lightslategray',] * 5
colors[0] = 'crimson'
fig = go.Figure(data=[go.Bar(
x=top_customers['CustomerName'],
y=top_customers['Quantity'],
marker_color=colors # marker color can be a single color value or an iterable,
)])
fig.update_layout(title_text='Top 5 Customers',
template='plotly_white')
fig.update_xaxes(title_text='Customers')
fig.update_yaxes(title_text='Total Orders')
fig.show()
details_category = details.groupby('Category').sum().reset_index()
fig = px.pie(details_category, values='Quantity', names='Category', color='Category',
color_discrete_map={'Clothing':'cyan',
'Electronics':'royalblue',
'Furniture':'darkblue'},
title='Total Quantity Sold per Category')
fig.show()
details_subcategory = details.groupby('Sub-Category').sum().reset_index()
fig = px.pie(details_subcategory, values='Quantity', names='Sub-Category', color='Sub-Category',
title='Total Quantity Sold per Sub-Category')
fig.show()
date_orders = order.groupby('Order Date').size().reset_index(name="Orders")
date_orders['Month'] = pd.DatetimeIndex(date_orders['Order Date']).month
date_orders['Year'] = pd.DatetimeIndex(date_orders['Order Date']).year
date_orders_2018 = date_orders[date_orders['Year']==2018]
date_orders_2019 = date_orders[date_orders['Year']==2019]
month_2018 = date_orders_2018.groupby('Month').sum().reset_index()
month_2019 = date_orders_2019.groupby('Month').sum().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(
name='2018',
x=month_2018['Month'],
y=month_2018['Orders'],
connectgaps=True # override default to connect the gaps
))
fig.add_trace(go.Scatter(
name='2019',
x=month_2019['Month'],
y=month_2019['Orders'],
connectgaps=True # override default to connect the gaps
))
fig.update_layout(title_text='Monthly Quantity Sold',
template='plotly_dark')
fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Orders')
fig.update_layout(
xaxis = dict(
tickvals = [1, 2, 3, 4, 5, 6, 7,8 ,9, 10, 11, 12],
ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
)
)
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
Customer demographics are categories of consumer populations that are relevant to a business' purposes, such as marketing and product design. The term also refers to the study of such categories in a business context.
fig = px.bar(orders_by_state, y='State', x='Total Orders',
title="Total Orders by State",
color_discrete_sequence=["springgreen"],
template='plotly_white')
# Disabling Zoom
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
#
orders_by_city = order.groupby(['City']).size().reset_index(name='Total Orders').sort_values(['Total Orders'])
fig = px.bar(orders_by_city, y='City', x='Total Orders',
title="Total Orders by City",
template='simple_white')
fig.layout.yaxis.tickmode='linear'
# Disabling Zoom
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
The state with the highest quantity sold Madhya Pradesh, followed by Maharastra and Rajasthan. There is a biggest gap between the quantity sold in Maharastra an Rajasthan with a difference of 58 units. While in case of Cities, it is Indore and Mumbai by a very wide margin. Chennai, Allahabad and Amritsar have the lowest quantity sold with less than 10 units sold.
A sales target is a goal set for a salesperson or sales department measured in revenue or units sold for a specific time.
target_category = target.groupby('Category').max().reset_index()
details_category = details.groupby('Category').sum().reset_index()
target_category['Actual_Amount'] = details_category['Profit']
fig = go.Figure(data=[
go.Bar(name='Target', x=target_category['Category'], y=target_category['Target'],
marker_color='#2b2d42'),
go.Bar(name='Actual Amount', x=target_category['Category'], y=target_category['Actual_Amount'],
marker_color='#d90429')
])
fig.update_layout(title_text='Actual vs Target Sales',
template='plotly_white')
fig.update_xaxes(title_text='Categories')
fig.update_yaxes(title_text='Amount')
fig.update_layout(yaxis_tickprefix = '₹', yaxis_tickformat = ',.2f')
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
The above bar graph illustrates the Target and the Actual Amount of Profits per Quantity. None of the category surpass or even meet the target. The most disappointing Category is Furniture, which managed to have a profit of Rs. 2298 with the target of Rs. 11.8K
Cluster analysis uses mathematical models to discover groups of similar customers based on the smallest variations among customers within each group.
Cluster analysis is the use of a mathematical model to discover groups of similar customers based on finding the smallest variations among customers within each group. The goal of cluster analysis in marketing is to accurately segment customers in order to achieve more effective customer marketing via personalization. A common cluster analysis method is a mathematical algorithm known as k-means cluster analysis, sometimes referred to as scientific segmentation. The clusters that result assist in better customer modeling and predictive analytics, and are also are used to target customers with offers and incentives personalized to their wants, needs and preferences.
customer_seg = df.groupby('CustomerName').sum().reset_index()
customer_seg = customer_seg[['CustomerName', 'Amount', 'Quantity']]
customer_seg.head()
| CustomerName | Amount | Quantity | |
|---|---|---|---|
| 0 | Aakanksha | 74.0 | 8 |
| 1 | Aarushi | 4701.0 | 49 |
| 2 | Aashna | 1931.0 | 32 |
| 3 | Aastha | 3276.0 | 28 |
| 4 | Aayush | 556.0 | 18 |
# Standardizing
customer_seg2 = customer_seg[['Amount', 'Quantity']]
scaler = StandardScaler()
scaler.fit(customer_seg2)
customers_normalized = scaler.transform(customer_seg2)
customers_normalized
# Elbow Method to find best number of clusters
sse = {}
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(customers_normalized)
sse[k] = kmeans.inertia_ # SSE to closest cluster centroid
# Plotting SSE
fig = go.Figure()
fig.add_trace(go.Scatter(
x=list(sse.keys()),
y=list(sse.values()),
connectgaps=True # override default to connect the gaps
))
fig.update_layout(title_text='The Elbow Method',
template='plotly_white')
fig.update_xaxes(title_text='k')
fig.update_yaxes(title_text='SSE')
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
C:\Users\ABINASH\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:881: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
The k-value of 3 is the best hyperparameter for our model because the next k-value tend to have a linear trend.
# KMeans
model = KMeans(n_clusters=3)
model.fit(customers_normalized)
customer_seg['Cluster'] = model.labels_ + 1
customer_seg['Cluster'] = customer_seg['Cluster'].astype('category')
customer_seg.head()
| CustomerName | Amount | Quantity | Cluster | |
|---|---|---|---|---|
| 0 | Aakanksha | 74.0 | 8 | 3 |
| 1 | Aarushi | 4701.0 | 49 | 2 |
| 2 | Aashna | 1931.0 | 32 | 1 |
| 3 | Aastha | 3276.0 | 28 | 1 |
| 4 | Aayush | 556.0 | 18 | 3 |
customer_seg.groupby('Cluster').agg({
'Amount':'mean',
'Quantity':'count'}).round(2)
| Amount | Quantity | |
|---|---|---|
| Cluster | ||
| 1 | 1989.32 | 96 |
| 2 | 5237.32 | 28 |
| 3 | 451.36 | 208 |
fig = px.scatter(customer_seg, x="Quantity", y="Amount",
color="Cluster",
template='plotly_white',
title="Amount vs Quantity - Customer Segmentation")
fig.layout.xaxis.fixedrange = True
fig.layout.yaxis.fixedrange = True
fig.show()
Segment 1: Medium Buyers
Segment 2: Loyal Buyers
Segment 3: Occational Buyers